subdiagnosis <- readr::read_tsv(
file.path("..", "..", "..", "data", "current", params$scpca_project_id, "single_cell_metadata.tsv"),
show_col_types = FALSE
) |>
dplyr::filter(scpca_sample_id == params$sample_id) |>
dplyr::pull(subdiagnosis)The aim is to explore the clustering and label transfers for the sample SCPCS000170. This sample has a(n) Favorable subdiagnosis.
In order to explore the clustering results, we look into some marker genes, pathways enrichment and label transfer.
This approach would provide us a rapid idea of the quality of the clustering:
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", params$scpca_project_id)
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")In this notebook, we are working with the Wilms tumor sample defined
in SCPCS000170 from the Wilms tumor dataset SCPCP000006. We work with
the pre-processed and labeled Seurat object that is the
output of
02b_label-transfer_fetal_kidney_reference_Stewart.Rmd saved
in the results directory.
To explore the clustering results, we look into some marker genes reported here:
Reports will be saved in the notebook directory. The
pre-processed and annotated Seurat object per samples are
saved in the result folder.
Here we defined function that will be used multiple time all along the notebook.
For a Seurat object objectand a metadata
metadata, the function visualize_metadata will
plot FeaturePlot and BarPlot
object is the Seurat object
metadata the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
visualize_metadata <- function(object, meta, group.by){
if(is.numeric(object@meta.data[,meta])){
d <- SCpubr::do_FeaturePlot(object,
features = meta,
pt.size = 0.2,
legend.width = 0.5,
legend.length = 5,
legend.position = "right") + ggtitle(meta)
b <- SCpubr::do_ViolinPlot(srat,
features = meta,
ncol = 1,
group.by = group.by,
legend.position = "none")
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}
else{
d <- SCpubr::do_DimPlot(object, reduction="umap", group.by = group.by, label = TRUE, repel = TRUE) + ggtitle(paste0(meta," - umap")) + theme(text=element_text(size=18))
b <- SCpubr::do_BarPlot(sample = object,
group.by = meta,
split.by = group.by,
position = "fill",
font.size = 10,
legend.ncol = 3) +
ggtitle("% cells")+
xlab(print(group.by)) +
theme(text=element_text(size=18))
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}
}For a Seurat object objectand a features
features, the function visualize_feature will
plot FeaturePlot and ViolinPlot
object is the Seurat object
features the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
visualize_feature <- function(object, features, group.by = "seurat_clusters"){
feature_symbol <- AnnotationDbi::select(org.Hs.eg.db,
keys=features,
columns="SYMBOL",
keytype="ENSEMBL")
d <- SCpubr::do_FeaturePlot(object,
features = feature_symbol$ENSEMBL,
pt.size = 0.2,
legend.width = 0.5,
legend.length = 5,
legend.position = "right") + ggtitle(feature_symbol$SYMBOL)
b <- SCpubr::do_ViolinPlot(srat,
features = feature_symbol$ENSEMBL,
ncol = 1,
group.by = group.by,
legend.position = "none",
assay = "SCT") + ylab(feature_symbol$SYMBOL)
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}ModuleScore from a MSigDB
datasetFor a Seurat object object, the function
MSigDB_score will calculate a score using
AddModuleScore() function for a MSigDB gene set
gs_name in the category category
object is the Seurat object to calculate the score.
Score will be added in the metadata of this object.
category is a MSigDB collection (https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp).
values in
c("H", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8")
gs_name is the name of a MSigDB gene set,
e.g. "HALLMARK_P53_PATHWAY"
name is the name of the module
nbins is the number of bins for
AddModuleScore to use. Its default value is 24, which
matches Seurat’s default value.
MSigDB_score <- function(object, category, gs_name, name, nbin = 24){
set <- msigdbr(species = "human", category = category)
set_list <- set %>%
dplyr::filter(gs_name == gs_name) %>%
dplyr::distinct(gs_name, gene_symbol, human_ensembl_gene) %>% as.data.frame()
set_list <- list(set_list$human_ensembl_gene)
suppressWarnings({
object <- AddModuleScore(object, features = set_list, name = name, nbin = nbin)
})
return(object)
}Enrichment_plotaim to perform enrichment of the marker
genes for each of the Seurat clusters and summarize the results in a
dotplot.
category is the MSigDB category or collection to be
used values in
c("H", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8")
signatures is a list of marker genes per
cluster
backgroundis the universe used for
enrichment
Enrichment_plot <- function(category, signatures, background){
## define genesets
tryCatch( expr = {
gene_set <- msigdbr(species = "human", category = category)
msigdbr_set <- gene_set %>% dplyr::distinct(gs_name, ensembl_gene) %>% as.data.frame()
cclust<-compareCluster(geneCluster = signatures,
fun = enricher,
TERM2GENE = msigdbr_set,
universe=background)
d <- dotplot(cclust,showCategory=15) + scale_y_discrete(labels=function(x) str_wrap(x, width=40))
return(d)
},
error = function(e) { print("No enrichment") } )
}do_Table_Heatmap shows heatmap of counts of cells for
combinations of two metadata variables
data seurat objectfirst_group is the name of the first metadata to group
the cellslast_group is the name of the second metadata to group
the cellsdo_Table_Heatmap <- function(data, first_group, last_group ){
df <- data@meta.data %>%
mutate_if(sapply(data@meta.data, is.character), as.factor) %>%
group_by( !!sym(first_group), !!sym(last_group))%>%
summarise(Nb = log(n()))
p <- ggplot(df, aes(x= !!sym(first_group), y = !!sym(last_group), fill = Nb)) +
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(text = element_text(size = 20)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
guides(fill=guide_legend(title=" Number of \n cells (log)"))
return(p)
}Seurat object# open the processed rds object
srat <- readRDS(file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_",params$sample_id,".Rds")))
DefaultAssay(srat) <- "SCT"
# Add number of bins to use with Seurat::AddModuleScore().
# One sample in this project fails with the default number of bins, so we need to override it
if (params$sample_id == "SCPCS000197") {
seurat_nbins <- 23 # default is 24, so this is very close
} else {
seurat_nbins <- 24
}We expect up to 5 set of clusters:
d1 <- SCpubr::do_DimPlot(srat, reduction="pca", group.by = "seurat_clusters", label = TRUE) + ggtitle("Seurat Cluster - pca")
d2 <- SCpubr::do_DimPlot(srat, reduction="umap", group.by = "seurat_clusters", label = TRUE) + ggtitle("Seurat Cluster - umap")
v1 <- SCpubr::do_ViolinPlot(srat, features = c( "subsets_mito_percent"), ncol = 1, group.by = "seurat_clusters", legend.position = "none")
v2 <- SCpubr::do_ViolinPlot(srat, features = c( "detected"), ncol = 1, group.by = "seurat_clusters", legend.position = "none")
v3 <- SCpubr::do_ViolinPlot(srat, features = c( "sum"), ncol = 1, group.by = "seurat_clusters", legend.position = "none")
d1 + d2 + (v1 + v2 +v3 + plot_layout(ncol=1) ) + plot_layout(ncol = 3, widths = c(2,2,4))s.genes <- srat@assays$RNA@meta.data$gene_ids[srat@assays$RNA@meta.data$gene_symbol %in% cc.genes$s.genes]
g2m.genes <- srat@assays$RNA@meta.data$gene_ids[srat@assays$RNA@meta.data$gene_symbol %in% cc.genes$g2m.genes]
srat <- CellCycleScoring(srat, s.features = s.genes, g2m.features = g2m.genes, set.ident = FALSE, nbin = seurat_nbins)## [1] "seurat_clusters"
Here, we open the table of marker genes marker-sets/CellType_metadata.csv. Note: we do not expect to have a clear and nice pattern of expression for all of the following markers in every tumor. This is just ti get a few idea.
for(feature in CellType_metadata$ENSEMBL_ID[CellType_metadata$ENSEMBL_ID %in% rownames(srat@assays$SCT)]){
print(visualize_feature(srat, features = feature, group.by = "seurat_clusters"))
}